Data Structure - dataset contains 1000 observations with 16 categorical and numerical variables.
Itemized List of Feature Variables:
Problem Statement: This report aims to analyse the Loan Default Dataset to estimate the probability of an account going into default, using the other variables as predictors, and hence enhance loan underwriting.
The report also examines the distribution of accounts across different demographic groups in order to identify outliers and assess the impact of, and correlation between, demographic characteristics and the probability of an account going into default.
# Load the Bank Loan Default dataset (1000 observations, 16 variables) from GitHub.
LDD <- read.csv("https://raw.githubusercontent.com/sameralzaim/W02/refs/heads/main/BankLoanDefaultDataset.csv")
#View (LDD)
library(gridExtra) # Load the package before using grid.arrange()
# Bin Age into groups whose labels match the actual intervals.
# With right = FALSE the intervals are left-closed, e.g. [18, 26) = ages 18-25.
# The original breaks (18, 25, 35, ...) put age 25 into the second interval
# while labelling the first one "18-25"; the breaks below align with the labels.
ldd <- LDD %>%
  mutate(Age_Group = cut(Age,
                         breaks = c(18, 26, 36, 46, 56, 66, Inf),
                         labels = c("18-25", "26-35", "36-45", "46-55", "56-65", "66+"),
                         right = FALSE)) # left-closed: 26 starts the "26-35" bin
# Bar plot of account counts by age group
age_plot <- ggplot(ldd, aes(x = Age_Group)) +
  geom_bar(fill = "navy") + # constant fill; a fill aesthetic would be overridden anyway
  labs(title = "No. of Acct by Age",
       x = "Age Group",
       y = "Count of Accounts") +
  theme_minimal() +
  theme(plot.title = element_text(size = 9, face = "bold", hjust = 0.5),
        legend.position = "none") # Remove redundant legend
# Group Employment Duration into bins.
# Emp_duration is apparently recorded in months (breaks at 12, 24, 48, ...
# with year-range labels) — TODO confirm against the data dictionary.
ldd <- ldd %>%
mutate(Emp_duration = cut(Emp_duration,
breaks = c(0, 12, 24, 48, 96, 120, Inf),
labels = c("< 1", "1-2", "2-4", "4-8", "8-10", "10+"),
right = FALSE)) # right = FALSE: left-closed intervals, so 24 months falls in "2-4", not "1-2"
# Create bar plot for Employment Duration
emp_plot <- ggplot(ldd, aes(x = Emp_duration, fill = Emp_duration)) +
geom_bar(fill = "navy") +
labs(title = "No. of Acct by Emp Years",
x = "Employment Years",
y = "Count of Accounts") +
theme_minimal() +
theme(plot.title = element_text(size = 9, face = "bold", hjust = 0.5), legend.position = "none") # Remove redundant legend
# Bar chart: number of accounts for each loan term
term_plot <- ggplot(ldd, aes(x = Term, fill = Term)) +
  geom_bar(fill = "navy") +
  ggtitle("No. of Acct by Loan Term") +
  xlab("Loan Term") +
  ylab("Count of Accounts") +
  theme_minimal() +
  theme(legend.position = "none", # legend is redundant with the x axis
        plot.title = element_text(size = 9, face = "bold", hjust = 0.5))
# Bin the number of credit accounts. Build on the already-binned data frame
# instead of resetting from the raw LDD (the original `ldd <- LDD` silently
# discarded the Age_Group and Emp_duration bins created above).
# With right = FALSE the intervals are [1,3), [3,5), [5,7), [7,Inf),
# i.e. 1-2, 3-4, 5-6 and 7+ accounts; the original labels
# ("1", "1-3", "3-5", "7+") did not describe these intervals.
ldd <- ldd %>%
  mutate(No_of_credit_acc = cut(No_of_credit_acc,
                                breaks = c(1, 3, 5, 7, Inf),
                                labels = c("1-2", "3-4", "5-6", "7+"),
                                right = FALSE))
# Bar plot of account counts by number of credit accounts
credit_plot <- ggplot(ldd, aes(x = No_of_credit_acc)) +
  geom_bar(fill = "navy") +
  labs(title = "No. of Acct by No of credit acc",
       x = "No of Credit Acc",
       y = "Count of Accounts") +
  theme_minimal() +
  theme(plot.title = element_text(size = 9, face = "bold", hjust = 0.5),
        legend.position = "none") # Remove redundant legend
# Arrange the four demographic plots in a single row
grid.arrange(age_plot, emp_plot, term_plot,credit_plot, ncol = 4)
library(gridExtra)
# Bar chart of account counts by default flag (0 vs 1)
Default_plot <- ggplot(LDD, aes(x = as.factor(Default))) +
  geom_bar(fill = "brown2", color = "black") +
  ggtitle("No. of Acct by Default") +
  xlab("Default") +
  ylab("Number of Accounts") +
  theme_minimal() +
  theme(legend.position = "none", # no legend needed for a single series
        plot.title = element_text(size = 9, face = "bold", hjust = 0.5))
# Bar chart of account counts by gender
gender_plot <- ggplot(LDD, aes(x = Gender)) +
  geom_bar(fill = "cyan4", color = "black") +
  ggtitle("No. of Acct by Gender") +
  xlab("Gender") +
  ylab("Number of Accounts") +
  theme_minimal() +
  theme(legend.position = "none", # no legend needed for a single series
        plot.title = element_text(size = 9, face = "bold", hjust = 0.5))
# Bar chart of account counts by marital status
marital_plot <- ggplot(LDD, aes(x = Marital_status)) +
  geom_bar(fill = "cyan4", color = "black") +
  ggtitle("No. of Acct by Marital Status") +
  xlab("Marital Status") +
  ylab("Number of Accounts") +
  theme_minimal() +
  theme(legend.position = "none", # no legend needed for a single series
        plot.title = element_text(size = 9, face = "bold", hjust = 0.5))
# Bar chart of account counts by employment status
emp_status_plot <- ggplot(LDD, aes(x = Emp_status)) +
  geom_bar(fill = "cyan4", color = "black") +
  ggtitle("No. of Acct by Emp Status") +
  xlab("Employment Status") +
  ylab("Number of Accounts") +
  theme_minimal() +
  theme(legend.position = "none", # no legend needed for a single series
        plot.title = element_text(size = 9, face = "bold", hjust = 0.5))
# Arrange the four status plots in a single row
grid.arrange(gender_plot, marital_plot, emp_status_plot,Default_plot, ncol = 4)
# Missing-value heatmap: one tile per (variable, observation), red where NA.
my_data <- as.data.frame(LDD)
# Add row index for plotting
my_data$row_id <- seq_len(nrow(my_data))
# Convert values to a binary missing indicator. row_id must be EXCLUDED from
# the across(): the original applied it to everything(), which turned row_id
# itself into all-zeros and collapsed the heatmap's y axis onto 0.
missing_data <- my_data %>%
  mutate(across(-row_id, ~ ifelse(is.na(.), 1, 0))) %>%
  pivot_longer(cols = -row_id, names_to = "Variable", values_to = "Missing")
# Plot missing values as a heatmap
ggplot(missing_data, aes(x = Variable, y = row_id)) +
  geom_tile(aes(fill = as.factor(Missing))) +
  scale_fill_manual(values = c("white", "red"), labels = c("Present", "Missing")) +
  labs(title = "Missing Data Heatmap", x = "Variables", y = "Observations") +
  theme_minimal()
# Reshape to long format — every variable becomes (Variable, Value) pairs keyed
# by Default — so one faceted plot can show each variable's distribution.
ldd_long <- LDD %>%
  mutate(across(everything(), as.character)) %>% # uniform type so columns can be stacked
  pivot_longer(cols = -Default, names_to = "Variable", values_to = "Value")
# Faceted bar plots of each variable's distribution, split by default status.
ggplot(ldd_long, aes(x = Value, fill = as.factor(Default))) +
  geom_bar(alpha = 0.7, position = "dodge") + # dodged bars: one per Default level
  facet_wrap(~ Variable, scales = "free") +   # separate panel per variable
  labs(title = "Distribution of Variables by Default Status",
       x = "Value",
       # the y axis shows bar counts, not the Default value — the original
       # label ("Default") was wrong
       y = "Number of Accounts",
       fill = "Default") +
  theme_minimal()
Distribution of Default Accounts by Gender and Marital status
# Counts of default (0 vs 1) by gender, faceted by marital status.
ds <- LDD
ds %>%
  filter(Default %in% c(0, 1)) %>%
  group_by(Gender, Marital_status, Default) %>%
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = Gender, y = n, fill = factor(Default))) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(. ~ Marital_status) + # one panel per marital status
  ggtitle("Number of Accounts with Default (0 vs 1) by Gender and Marital Status") +
  labs(fill = "Default", x = "Gender", y = "Number of Accounts") +
  scale_fill_manual(values = c("1" = "brown2", "0" = "cyan4")) + # red = default, teal = no default
  theme_minimal() +
  theme(
    panel.background = element_rect(fill = "#F0F0F0", color = NA), # gray inside panels only
    plot.background = element_blank(), # keep titles/axis area free of gray
    strip.background = element_rect(fill = "#D3D3D3", color = "#D3D3D3"), # light gray facet headers
    strip.text = element_text(color = "black", face = "bold"), # bold black facet labels
    panel.spacing.x = unit(2, "lines"), # wide white separation between panels
    # `size` for element_line is deprecated since ggplot2 3.4.0 — use `linewidth`
    panel.grid.major = element_line(color = "white", linewidth = 0.7),
    panel.grid.minor = element_line(color = "white", linewidth = 0.7)
  )
For the sake of the analysis we will introduce missing values into Saving_amount and then count how many values are missing.
sum(is.na(lddm$Saving_amount))
[1] 90
We start by pairing the variables to get a sense of the potential relationships. From the plot below we can see that Amount and Saving_amount have similar distributions; the same observation also holds for Marital_status.
# Impute the missing Saving_amount values by regression imputation with a
# randomly sampled residual term (random regression imputation).
# na.exclude keeps fitted values / residuals aligned with the rows of lddm.
pred.mdl <- lm(Saving_amount ~ Emp_duration + Marital_status + Amount + Credit_score,
               data = lddm, na.action = na.exclude)
# Rows where Saving_amount is missing
missing_idx <- which(is.na(lddm$Saving_amount))
# Deterministic regression predictions for the missing rows only
pred.Saving_amount <- predict(pred.mdl, newdata = lddm[missing_idx, ])
# Sanity check BEFORE writing anything back (the original assigned first and
# checked afterwards, when it was too late for the check to help).
if (length(pred.Saving_amount) != length(missing_idx)) {
  stop("Prediction length does not match the number of missing values.")
}
# Residuals from the model. Drop the NA entries that na.exclude pads in for
# the missing rows — otherwise sample() below could re-introduce NA values
# into the "imputed" data.
pred.resid <- na.omit(resid(pred.mdl))
# Add a randomly sampled residual to each prediction; fall back to the
# deterministic predictions if there are too few residuals to sample from.
m0 <- length(pred.Saving_amount)
if (length(pred.resid) >= m0) {
  pred.yrand <- pred.Saving_amount + sample(pred.resid, m0, replace = TRUE)
} else {
  pred.yrand <- pred.Saving_amount # deterministic fallback
}
# Single write-back of the final imputed values (the original first stored
# the deterministic predictions and then immediately overwrote them).
lddm$Saving_amount[missing_idx] <- pred.yrand
# Verify that no missing values remain
remaining_na <- sum(is.na(lddm$Saving_amount))
# Visualise the imputation: observed values in gray, deterministic regression
# imputations in red, random regression imputations in blue.
with(lddm,
     plot(Checking_amount, Saving_amount,
          main = "Saving_amount vs Checking_amount",
          xlab = "Checking Amount", ylab = "Saving Amount", col = "gray"))
# Overlay both sets of imputed values at the rows that were missing
points(lddm$Checking_amount[missing_idx], pred.Saving_amount, pch = 19, col = "red")
points(lddm$Checking_amount[missing_idx], pred.yrand, pch = 19, col = "blue")
# Explain the two overlay colours
legend("topleft",
       legend = c("Regression Imputation", "Random Regression Imputation"),
       col = c("red", "blue"), pch = rep(19, 2), bty = "n", cex = 0.8)
# Confirm the imputation succeeded. Note: the imputed values were already
# written back above, so recomputing the missing index here yields an empty
# vector. The original then re-assigned pred.Saving_amount[missing_idx],
# which is a no-op at best and mis-indexed at worst (pred.Saving_amount is
# indexed 1..m, not by row number) — that assignment is removed.
missing_idx <- which(is.na(lddm$Saving_amount))
# Count missing values after imputation (expected: 0)
missing_after <- sum(is.na(lddm$Saving_amount))
print(paste("Missing values after imputation:", missing_after))
[1] "Missing values after imputation: 0"
# Load necessary library
# Fit a full logistic regression model with all predictors
full_model <- glm(Default ~ ., data = lddm, family = binomial)
# Stepwise selection by AIC in BOTH directions (add or drop at each step);
# step() prints the elimination trace shown below
step_model <- step(full_model, direction = "both")
Start: AIC=346.23
Default ~ Checking_amount + Term + Credit_score + Gender + Marital_status +
Car_loan + Personal_loan + Home_loan + Education_loan + Emp_status +
Amount + Saving_amount + Emp_duration + Age + No_of_credit_acc
Df Deviance AIC
- Gender 1 314.24 344.24
- Car_loan 1 314.30 344.30
- Education_loan 1 314.33 344.33
- Marital_status 1 314.40 344.40
- Personal_loan 1 314.60 344.60
- Emp_duration 1 314.85 344.85
- No_of_credit_acc 1 315.24 345.24
<none> 314.23 346.23
- Home_loan 1 316.30 346.30
- Amount 1 316.80 346.80
- Emp_status 1 317.45 347.45
- Term 1 326.99 356.99
- Credit_score 1 347.61 377.61
- Saving_amount 1 391.55 421.55
- Checking_amount 1 391.88 421.88
- Age 1 547.07 577.07
Step: AIC=344.24
Default ~ Checking_amount + Term + Credit_score + Marital_status +
Car_loan + Personal_loan + Home_loan + Education_loan + Emp_status +
Amount + Saving_amount + Emp_duration + Age + No_of_credit_acc
Df Deviance AIC
- Car_loan 1 314.30 342.30
- Education_loan 1 314.34 342.34
- Marital_status 1 314.51 342.51
- Personal_loan 1 314.61 342.61
- Emp_duration 1 314.86 342.86
- No_of_credit_acc 1 315.24 343.24
<none> 314.24 344.24
- Home_loan 1 316.31 344.31
- Amount 1 316.82 344.82
- Emp_status 1 317.47 345.47
+ Gender 1 314.23 346.23
- Term 1 327.17 355.17
- Credit_score 1 347.65 375.65
- Saving_amount 1 391.55 419.55
- Checking_amount 1 391.91 419.91
- Age 1 547.23 575.23
Step: AIC=342.3
Default ~ Checking_amount + Term + Credit_score + Marital_status +
Personal_loan + Home_loan + Education_loan + Emp_status +
Amount + Saving_amount + Emp_duration + Age + No_of_credit_acc
Df Deviance AIC
- Marital_status 1 314.57 340.57
- Emp_duration 1 314.90 340.90
- No_of_credit_acc 1 315.29 341.29
<none> 314.30 342.30
- Amount 1 316.89 342.89
- Emp_status 1 317.56 343.56
+ Car_loan 1 314.24 344.24
+ Gender 1 314.30 344.30
- Personal_loan 1 321.57 347.57
- Education_loan 1 321.92 347.92
- Term 1 327.20 353.20
- Home_loan 1 330.46 356.46
- Credit_score 1 347.70 373.70
- Checking_amount 1 391.92 417.92
- Saving_amount 1 392.12 418.12
- Age 1 547.68 573.68
Step: AIC=340.57
Default ~ Checking_amount + Term + Credit_score + Personal_loan +
Home_loan + Education_loan + Emp_status + Amount + Saving_amount +
Emp_duration + Age + No_of_credit_acc
Df Deviance AIC
- Emp_duration 1 315.02 339.02
- No_of_credit_acc 1 315.72 339.72
<none> 314.57 340.57
- Amount 1 317.09 341.09
- Emp_status 1 317.61 341.61
+ Marital_status 1 314.30 342.30
+ Gender 1 314.46 342.46
+ Car_loan 1 314.51 342.51
- Personal_loan 1 321.76 345.76
- Education_loan 1 322.27 346.27
- Term 1 328.06 352.06
- Home_loan 1 330.65 354.65
- Credit_score 1 347.91 371.91
- Checking_amount 1 392.16 416.16
- Saving_amount 1 392.70 416.70
- Age 1 548.37 572.37
Step: AIC=339.02
Default ~ Checking_amount + Term + Credit_score + Personal_loan +
Home_loan + Education_loan + Emp_status + Amount + Saving_amount +
Age + No_of_credit_acc
Df Deviance AIC
- No_of_credit_acc 1 316.03 338.03
<none> 315.02 339.02
- Amount 1 317.60 339.60
- Emp_status 1 318.45 340.45
+ Emp_duration 1 314.57 340.57
+ Marital_status 1 314.90 340.90
+ Gender 1 314.96 340.96
+ Car_loan 1 314.98 340.98
- Personal_loan 1 322.17 344.17
- Education_loan 1 322.56 344.56
- Term 1 328.53 350.53
- Home_loan 1 331.25 353.25
- Credit_score 1 347.93 369.93
- Checking_amount 1 392.17 414.17
- Saving_amount 1 392.87 414.87
- Age 1 549.80 571.80
Step: AIC=338.03
Default ~ Checking_amount + Term + Credit_score + Personal_loan +
Home_loan + Education_loan + Emp_status + Amount + Saving_amount +
Age
Df Deviance AIC
<none> 316.03 338.03
- Amount 1 318.52 338.52
+ No_of_credit_acc 1 315.02 339.02
- Emp_status 1 319.07 339.07
+ Emp_duration 1 315.72 339.72
+ Marital_status 1 315.78 339.78
+ Gender 1 315.91 339.91
+ Car_loan 1 316.01 340.01
- Personal_loan 1 323.38 343.38
- Education_loan 1 323.46 343.46
- Term 1 329.33 349.33
- Home_loan 1 332.49 352.49
- Credit_score 1 348.73 368.73
- Checking_amount 1 394.33 414.33
- Saving_amount 1 395.76 415.76
- Age 1 552.47 572.47
From the full-model stepwise logistic regression above, we obtained our primary model. We now use cross-validation to compare its performance against a challenger model and arrive at an optimal selection: the following two logistic regression models are compared using 5-fold cross-validation.
Model 1: Default = α0+ α1×Checking_amount + α2×Term + α3×Credit_score + α4×Personal_loan + α5×Home_loan + α6×Education_loan + α7×Emp_status + α8×Saving_amount + α9×Age
Model 2: Default = α0+ α1×Credit_score + α2×Home_loan + α3×Education_loan + α4×Emp_status + α5×Saving_amount + α6×Age
Model Selection via 5-fold Cross-Validation
We use 5-fold cross-validation to select the better one from Model 1 and Model 2. 5-fold cross-validation: splitting the training set into 5 folds with equal size to perform cross-validation.
Data Splitting: Using random splitting to partition the data into training set (700) and testing set (300).
Choose Performance Measure: The resulting model will be used to predict the Default. We use the mean square error to measure the predictive performance.
# LDD was already loaded with read.csv() above; the original `data(LDD)` only
# works for datasets shipped inside packages and merely emits a
# "data set 'LDD' not found" warning here, so it is removed.
#LDD$am <- as.factor(mtcars$am) # leftover from an mtcars example — not used
# Fit the logistic regression selected by the stepwise search (Model 1)
model_glm <- glm(Default ~ Checking_amount + Term + Credit_score + Personal_loan +
                   Home_loan + Education_loan + Emp_status + Saving_amount +
                   Age, family = binomial, data = LDD)
# Summary of the model
summary(model_glm)
Call:
glm(formula = Default ~ Checking_amount + Term + Credit_score +
Personal_loan + Home_loan + Education_loan + Emp_status +
Saving_amount + Age, family = binomial, data = LDD)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 39.9793055 3.8250697 10.452 < 2e-16 ***
Checking_amount -0.0051043 0.0006725 -7.590 3.21e-14 ***
Term 0.1714373 0.0515333 3.327 0.000879 ***
Credit_score -0.0107846 0.0020565 -5.244 1.57e-07 ***
Personal_loan -0.9157256 0.3307825 -2.768 0.005634 **
Home_loan -2.8298877 0.7711243 -3.670 0.000243 ***
Education_loan 1.3059092 0.5483430 2.382 0.017240 *
Emp_statusunemployed 0.5855966 0.3339530 1.754 0.079511 .
Saving_amount -0.0047747 0.0006035 -7.912 2.54e-15 ***
Age -0.6504636 0.0632505 -10.284 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1221.7 on 999 degrees of freedom
Residual deviance: 301.9 on 990 degrees of freedom
AIC: 321.9
Number of Fisher Scoring iterations: 7
# Install plotly if not already installed; requireNamespace() tests for the
# package without attaching it
if (!requireNamespace("plotly", quietly = TRUE)) {
install.packages("plotly")
}
# Load the package (needed for ggplotly() below)
library(plotly)
# 5-fold cross-validation comparing the two candidate logistic models.
# set.seed(1) # uncomment for a reproducible train/test split
# Random split: 700 training rows, the remaining 300 held out for testing
train.ID = sample(1:dim(lddm)[1], 700, replace = FALSE) # without replacement
train = lddm[train.ID, ]
test = lddm[-train.ID, ]
## split the training set into k folds to train and validate the candidates
N = dim(train)[1] # size of training data
k = 5             # number of folds
fld.n = ceiling(N / k)
MSE.m1 = numeric(k) # preallocate rather than growing a NULL vector
MSE.m2 = numeric(k)
for (i in 1:k) {
  ## cap the last fold at N so the indices never run past the data
  ## (the original (i*fld.n) could index beyond the training set when
  ## N is not a multiple of k, producing NA rows)
  valid.ID = ((i - 1) * fld.n + 1):min(i * fld.n, N)
  valid.set = train[valid.ID, ]
  train.set = train[-valid.ID, ]
  ## fit the two candidate models on the remaining k-1 folds
  M01 = glm(Default ~ Checking_amount + Term + Credit_score + Personal_loan + Home_loan + Education_loan + Emp_status + Saving_amount + Age, family = binomial, data = train.set)
  M02 = glm(Default ~ Credit_score + Home_loan + Education_loan + Emp_status + Saving_amount + Age, family = binomial, data = train.set)
  ## predict on the PROBABILITY scale: without type = "response",
  ## predict.glm() returns log-odds, and squared error against the 0/1
  ## Default labels is meaningless (this was the original behaviour)
  predM01 = predict(M01, newdata = valid.set, type = "response")
  predM02 = predict(M02, newdata = valid.set, type = "response")
  ## mean squared error (Brier score) on the validation fold
  MSE.m1[i] = mean((predM01 - valid.set$Default)^2)
  MSE.m2[i] = mean((predM02 - valid.set$Default)^2)
}
## Data frame holding the per-fold MSE of both candidate models
MSE = data.frame(fold = rep(1:k, 2),
                 MSE = c(MSE.m1, MSE.m2),
                 type = c(rep("Model 1", k), rep("Model 2", k)))
## Line plot of per-fold MSEs. annotate() draws each mean-MSE label exactly
## once; the original geom_text(aes(...)) with constant coordinates stamped
## the same label on top of itself once per data row.
cvplot = ggplot(data = MSE, aes(x = fold, y = MSE, color = type)) +
  geom_line() +
  geom_point() +
  coord_cartesian(xlim = c(0, 6),
                  ylim = c(0, 40)) +
  annotate("text", x = 2.0, y = 5, hjust = 0,
           label = paste("Model 1 Mean MSE: = ", round(mean(MSE.m1), 3), "")) +
  annotate("text", x = 2.0, y = 10, hjust = 0,
           label = paste("Model 2 Mean MSE: = ", round(mean(MSE.m2), 3), "")) +
  ggtitle("Line plots of MSE candidate Models across folds") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.margin = unit(c(1, 1, 1, 1), "cm"))
ggplotly(cvplot)
From the above we can see that Model 2 has a lower MSE and is simpler (6 predictors) than Model 1, and hence we choose Model 2 as the model for predicting Default.